import os
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load the two fact-check exports. Both files must provide a `claim` column;
# Snopes additionally needs `date_published`, PolitiFact needs `fc_date`.
sn = pd.read_csv("./snopes.csv")
pf = pd.read_csv("./politifact.csv")

sn['date_published'] = pd.to_datetime(sn['date_published'])
pf['fc_date'] = pd.to_datetime(pf['fc_date'])
# Row counts noted by the original author: len(sn) == 11639, len(pf) == 10710.

# PolitiFact claims come first, Snopes claims second; the TF-IDF matrix is
# split back into the two sources by position further down.
statements = pd.concat([pf['claim'], sn['claim']], ignore_index=True)
assert len(statements) == len(pf) + len(sn)
n_missing_claims = statements.isnull().sum()

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
# The vectorizer raises on NaN documents. Substituting an empty string keeps
# every row in place (so the positional split below stays valid); an empty
# document contributes no terms, so such rows should not match anything.
X = vectorizer.fit_transform(statements.fillna(""))
# NOTE(review): toarray() materialises the full dense matrix, which is memory
# hungry. It is kept because the slices are measured with len() below, which
# a sparse matrix does not support.
arr = X.toarray()
arr_pf = arr[:len(pf)]
arr_sn = arr[len(pf):]

assert len(arr_pf) == len(pf)
assert len(arr_sn) == len(sn)


''' Evaluation'''
def sim(arr1, arr2, cri):
    """Compare two sets of row vectors by cosine similarity.

    Args:
        arr1: 2-D feature matrix, one row per item of the first collection.
        arr2: 2-D feature matrix, one row per item of the second collection.
        cri: similarity threshold; a pair counts as a match only when its
            similarity is strictly greater than this value.

    Returns:
        A 7-tuple ``(tfidf_sim, y1, y2, n1, n2, pct1, pct2)`` where

        * ``tfidf_sim`` is the ``cosine_similarity(arr1, arr2)`` matrix
          (rows follow ``arr1``, columns follow ``arr2``),
        * ``y1`` is a list of bools, one per row of ``arr1``: True when the
          row has at least one match in ``arr2``,
        * ``y2`` is the same per row of ``arr2``,
        * ``n1`` / ``n2`` are the number of True entries in ``y1`` / ``y2``,
        * ``pct1`` / ``pct2`` are those counts as a percentage of the number
          of rows in ``arr1`` / ``arr2``.
    """
    tfidf_sim = cosine_similarity(arr1, arr2)

    # One vectorised comparison replaces the per-element Python loops; the
    # row-wise / column-wise reductions give the same flags as scanning each
    # row of the matrix and of its transpose.
    above = tfidf_sim > cri
    y1 = above.any(axis=1).tolist()
    y2 = above.any(axis=0).tolist()

    # Row counts come from the similarity matrix itself, so inputs that do
    # not support len() (e.g. scipy sparse matrices) work as well.
    n_rows1, n_rows2 = tfidf_sim.shape
    matched1 = sum(y1)
    matched2 = sum(y2)
    return (
        tfidf_sim,
        y1,
        y2,
        matched1,
        matched2,
        (matched1 / n_rows1) * 100,
        (matched2 / n_rows2) * 100,
    )

# Similarity threshold shared by the overlap flags and the match table.
SIM_THRESHOLD = 0.5
tfidf_sim, y1, y2, sumy1, sumy2, y1p, y2p = sim(arr_sn, arr_pf, SIM_THRESHOLD)

pf['content_owner'] = "PolitiFact"

# Per-claim flag: does the claim have a sufficiently similar claim on the
# other site?
sn['overlap'] = y1
pf['overlap'] = y2

# Best PolitiFact candidate for every Snopes claim, computed once for the
# whole matrix instead of re-scanning the row for each output field.
best_pf_pos = tfidf_sim.argmax(axis=1)
best_pf_score = tfidf_sim.max(axis=1)

# SN based similarity
# Rows are picked with the same flags that were stored in sn['overlap'], so
# the match table and the overlap column cannot disagree. (The previous loop
# tested `>= 0.5` while sim() uses a strict `>`, so a similarity of exactly
# 0.5 was treated differently in the two places.)
matched_sn_pos = [pos for pos, has_match in enumerate(y1) if has_match]
matched_pf_pos = best_pf_pos[matched_sn_pos]

# Positional selection: row i of tfidf_sim corresponds to the i-th row of
# `sn`, and column j to the j-th row of `pf`, regardless of index labels.
sn_hits = sn.iloc[matched_sn_pos]
pf_hits = pf.iloc[matched_pf_pos]

sn_sim_result = {
    'sim_score': [round(score, 3) for score in best_pf_score[matched_sn_pos]],
    'website': sn_hits['content_owner'].tolist(),
    'claim': sn_hits['claim'].tolist(),
    'rating': sn_hits['rating'].tolist(),
    'link': sn_hits['link'].tolist(),
    'date': sn_hits['date_published'].tolist(),
    'website2': pf_hits['content_owner'].tolist(),
    'claim2': pf_hits['claim'].tolist(),
    'rating2': pf_hits['rating'].tolist(),
    'link2': pf_hits['link'].tolist(),
    'date2': pf_hits['fc_date'].tolist(),
}

# One output row per flagged Snopes claim.
assert len(sn_sim_result['link']) == sumy1
sn_simdf = pd.DataFrame(sn_sim_result)
# The original author's run reported 749 rows here.

# For interactive inspection of the raw labels, evaluate
# set(sn['rating']) and set(pf['rating']).

# Raw label -> common rating scale. Labels that are not listed are kept
# unchanged. The two tables differ only in how plain true/false is spelled.
# NOTE(review): the Snopes-side keys use PolitiFact-style slugs
# ('mostly-true', 'pants-fire', ...); confirm against set(sn['rating']) that
# these are the labels that actually occur in the Snopes export.
RATING_SCALE_SN = {
    'true': 'True',
    'mostly-true': 'Mostly True',
    'half-true': 'Mixture',
    'barely-true': 'Mostly False',
    'false': 'False',
    'pants-fire': 'False',
}
RATING_SCALE_PF = {
    'TRUE': 'True',
    'mostly-true': 'Mostly True',
    'half-true': 'Mixture',
    'barely-true': 'Mostly False',
    'FALSE': 'False',
    'pants-fire': 'False',
}
# Common rating scale -> binary veracity. 'Mixture' and any other label are
# deliberately left as they are.
VERACITY_OF_RATING = {
    'True': 'real',
    'Mostly True': 'real',
    'Mostly False': 'fake',
    'False': 'fake',
}

sn_simdf['converted_rating'] = sn_simdf['rating'].map(
    lambda label: RATING_SCALE_SN.get(label, label))
sn_simdf['converted_rating2'] = sn_simdf['rating2'].map(
    lambda label: RATING_SCALE_PF.get(label, label))

df_li = [sn_simdf]

# For interactive inspection of the converted labels, evaluate
# set(sn_simdf['converted_rating']) and set(sn_simdf['converted_rating2']).

sn_simdf["veracity"] = sn_simdf["converted_rating"].map(
    lambda label: VERACITY_OF_RATING.get(label, label))
sn_simdf["veracity2"] = sn_simdf["converted_rating2"].map(
    lambda label: VERACITY_OF_RATING.get(label, label))

# 1 when both sites agree, 0 otherwise. Computed column-wise: the previous
# per-row `sn_simdf[col][i] = 1` was chained assignment, which does not write
# back to the frame under pandas Copy-on-Write and would leave every flag 0.
sn_simdf['rating_same'] = (
    sn_simdf['converted_rating'] == sn_simdf['converted_rating2']
).astype('int64')
sn_simdf['veracity_same'] = (
    sn_simdf['veracity'] == sn_simdf['veracity2']
).astype('int64')

sn_simdf.name = "Snopes"

# Table 1 (Snopes figures).
# The PolitiFact figures can be calculated with the same method.
total_claims = len(sn)
matching_claims = len(sn_simdf)
# Disagreements = matched pairs minus the pairs flagged as agreeing.
rating_disagreements = matching_claims - sn_simdf['rating_same'].sum()
veracity_disagreements = matching_claims - sn_simdf['veracity_same'].sum()

print("Snopes")
print("Total number of claims:", total_claims)
print("Matching Claims:", matching_claims)
print("Disagreed in rating level:", rating_disagreements)
print("Disagreed in veracity level:", veracity_disagreements)
